In [1]:
# Library for handling data
import pandas as pd

# Library used to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Library for measuring accuracy of ML models
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Machine Learning model used
from sklearn.ensemble import RandomForestClassifier

# Library for visualization
import plotly.express as px
In [2]:
df = pd.read_csv('pbta_miRactivity_results.csv')
df
Out[2]:
Kids_First_Biospecimen_ID sample_id aliquot_id RNA_library short_histology molecular_subtype age_at_diagnosis_days reported_gender MIMAT0010195 MIMAT0004481 ... MIMAT0004986 MIMAT0004987 MIMAT0000094 MIMAT0026473 MIMAT0004510 MIMAT0000095 MIMAT0022842 MIMAT0000097 MIMAT0004678 MIMAT0000689
0 BS_GXTFW99H 7316-2151 740006 stranded HGAT HGG, H3 wildtype 2166 Male 0.203215 0.170353 ... 0.163825 0.141134 0.192549 0.153315 0.147092 0.299865 0.134360 0.044772 0.167571 0.105896
1 BS_HF3RNN2T 7316-2155 564491 stranded Medulloblastoma MB, Group4 3606 Female 0.183178 0.246241 ... 0.153730 0.147374 0.322979 0.119636 0.139716 0.185569 0.173047 0.180138 0.172517 0.151552
2 BS_MMNRZ1EQ 7316-2141 571446 stranded ATRT NaN 383 Male 0.056398 -0.019984 ... -0.130431 -0.140412 -0.111823 -0.140318 -0.142191 -0.041198 -0.148094 -0.080920 -0.018398 0.060411
3 BS_53GQKVEX 7316-487 588341 stranded LGAT LGG, KIAA1549-BRAF 553 Male 0.169361 0.223773 ... 0.145929 0.145370 0.298415 0.121007 0.138993 0.214281 0.166954 0.166200 0.155085 0.144268
4 BS_JS95PE0J 7316-1774 574549 stranded HGAT HGG, H3 wildtype, TP53 loss 2431 Male 0.132491 0.170572 ... 0.144303 0.113045 0.219452 0.115481 0.124098 0.231750 0.119749 -0.039174 0.145273 0.000353
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
239 BS_RCFQ31XF 7316-873 570111 stranded Ependymoma EPN, To be classified 442 Male 0.157141 0.125819 ... 0.086446 0.070290 0.126832 0.068954 0.074025 0.174126 0.074397 0.061975 0.110112 0.136656
240 BS_QXKRN6CR 7316-111 470423 stranded Ependymoma EPN, ST RELA 4519 Female 0.146616 0.118695 ... 0.045809 0.035370 0.069957 0.042077 0.038712 0.148805 0.029867 0.066198 0.084429 0.123514
241 BS_4FZS7TX4 7316-160 549587 stranded Ependymoma EPN, To be classified 1223 Male 0.194594 0.116444 ... 0.070706 0.075477 0.076592 0.069451 0.078228 0.165159 0.073962 0.113423 0.113134 0.197433
242 BS_XQ5SFW35 7316-89 577715 stranded HGAT HGG, H3 wildtype 1461 Male 0.107664 0.102398 ... 0.100223 0.099694 0.141058 0.088554 0.105310 0.213635 0.098756 0.099611 0.059582 0.113228
243 BS_GS1KR5PK 7316-1772 717124 stranded Medulloblastoma MB, SHH 1221 Male 0.209188 0.094796 ... 0.036446 0.036185 0.015965 0.039582 0.038798 0.153996 0.027754 0.072525 0.116735 0.214161

244 rows × 2058 columns

In [3]:
hdf = df.drop(['Kids_First_Biospecimen_ID', 'sample_id', 'aliquot_id', 'RNA_library', 'molecular_subtype', 'age_at_diagnosis_days', 'reported_gender'], axis=1)
hdf
Out[3]:
short_histology MIMAT0010195 MIMAT0004481 MIMAT0000062 MIMAT0000063 MIMAT0026472 MIMAT0000064 MIMAT0004484 MIMAT0000065 MIMAT0004485 ... MIMAT0004986 MIMAT0004987 MIMAT0000094 MIMAT0026473 MIMAT0004510 MIMAT0000095 MIMAT0022842 MIMAT0000097 MIMAT0004678 MIMAT0000689
0 HGAT 0.203215 0.170353 0.044772 0.100227 0.228451 0.044772 0.118220 0.044772 0.013709 ... 0.163825 0.141134 0.192549 0.153315 0.147092 0.299865 0.134360 0.044772 0.167571 0.105896
1 Medulloblastoma 0.183178 0.246241 0.180138 0.162924 0.157944 0.180138 0.201304 0.180138 0.155110 ... 0.153730 0.147374 0.322979 0.119636 0.139716 0.185569 0.173047 0.180138 0.172517 0.151552
2 ATRT 0.056398 -0.019984 -0.080920 -0.121344 -0.050051 -0.080920 -0.097578 -0.080920 0.017420 ... -0.130431 -0.140412 -0.111823 -0.140318 -0.142191 -0.041198 -0.148094 -0.080920 -0.018398 0.060411
3 LGAT 0.169361 0.223773 0.166200 0.160803 0.155500 0.166200 0.206727 0.166200 0.156179 ... 0.145929 0.145370 0.298415 0.121007 0.138993 0.214281 0.166954 0.166200 0.155085 0.144268
4 HGAT 0.132491 0.170572 -0.039174 -0.012450 0.191062 -0.039174 0.025425 -0.039174 0.029123 ... 0.144303 0.113045 0.219452 0.115481 0.124098 0.231750 0.119749 -0.039174 0.145273 0.000353
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
239 Ependymoma 0.157141 0.125819 0.061975 0.082500 0.131669 0.061975 0.106920 0.061975 0.106418 ... 0.086446 0.070290 0.126832 0.068954 0.074025 0.174126 0.074397 0.061975 0.110112 0.136656
240 Ependymoma 0.146616 0.118695 0.066198 0.063461 0.127475 0.066198 0.096739 0.066198 0.086737 ... 0.045809 0.035370 0.069957 0.042077 0.038712 0.148805 0.029867 0.066198 0.084429 0.123514
241 Ependymoma 0.194594 0.116444 0.113423 0.120964 0.112428 0.113423 0.143909 0.113423 0.159605 ... 0.070706 0.075477 0.076592 0.069451 0.078228 0.165159 0.073962 0.113423 0.113134 0.197433
242 HGAT 0.107664 0.102398 0.099611 0.126305 0.113148 0.099611 0.160924 0.099611 0.100055 ... 0.100223 0.099694 0.141058 0.088554 0.105310 0.213635 0.098756 0.099611 0.059582 0.113228
243 Medulloblastoma 0.209188 0.094796 0.072525 0.093812 0.118819 0.072525 0.119084 0.072525 0.168033 ... 0.036446 0.036185 0.015965 0.039582 0.038798 0.153996 0.027754 0.072525 0.116735 0.214161

244 rows × 2051 columns

In [4]:
# Split the dataset into features and labels
X = hdf.loc[:, hdf.columns != 'short_histology'].values
y = hdf.loc[:, hdf.columns == 'short_histology'].values.ravel()

# Split PBTA data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

#Sanity check
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
(183, 2050) (61, 2050) (183,) (61,)
In [5]:
# Initialize random forest classifier
rfc = RandomForestClassifier(max_depth=10, random_state=0)

# Train the random forest classifier
rfc.fit(X_train, y_train)

# Make predictions using random forest classifier
rfc_y_pred = rfc.predict(X_test)

# Accuracy of model
print(f'Accuracy: {accuracy_score(y_test, rfc_y_pred)}')
Accuracy: 0.3770491803278688
In [6]:
# Calculate a confusion matrix
cm = confusion_matrix(y_test, rfc_y_pred, labels=rfc.classes_)

# Display confusion matrix to look at how accurately the ML model was able to classify each tumor type
disp = px.imshow(cm, text_auto=True,
                labels=dict(x="Predicted Tumor Type", y="True Tumor Type", color="Productivity"),
                x=df['short_histology'].unique().tolist(),
                y=df['short_histology'].unique().tolist()
                )
disp.show()
In [7]:
# Library for UMAP projections
from umap.umap_ import UMAP
In [8]:
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(X)
proj_3d = umap_3d.fit_transform(X)

fig_2d = px.scatter(proj_2d, x=0, y=1, color=hdf['short_histology'].tolist())

fig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=hdf['short_histology'].tolist())

fig_2d.show()
fig_3d.show()